analysis/pattern/TestPatternTokenizer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TestPatternTokenizer extends BaseTokenStreamTestCase
{
  public void testSplitting() throws Exception
  {
    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
    String[][] tests = {
      // group  pattern        input                    output
      { "-1",   "--",          "aaa--bbb--ccc",         "aaa bbb ccc" },
      { "-1",   ":",           "aaa:bbb:ccc",           "aaa bbb ccc" },
      { "-1",   "\\p{Space}",  "aaa   bbb \t\tccc  ",   "aaa bbb ccc" },
      { "-1",   ":",           "boo:and:foo",           "boo and foo" },
      { "-1",   "o",           "boo:and:foo",           "b :and:f" },
      { "0",    ":",           "boo:and:foo",           ": :" },
      { "0",    qpattern,      "aaa 'bbb' 'ccc'",       "'bbb' 'ccc'" },
      { "1",    qpattern,      "aaa 'bbb' 'ccc'",       "bbb ccc" }
    };

    for( String[] test : tests ) {
      TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
      ((Tokenizer)stream).setReader(new StringReader(test[2]));
      String out = tsToString( stream );
      // System.out.println( test[2] + " ==> " + out );

      assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );

      // Make sure it is the same as if we called 'split'
      // test disabled, as we remove empty tokens
      /*if( "-1".equals( test[0] ) ) {
        String[] split = test[2].split( test[1] );
        stream = tokenizer.create( new StringReader( test[2] ) );
        int i=0;
        for( Token t = stream.next(); null != t; t = stream.next() )
        {
          assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
        }
      }*/
    }
  }

  public void testOffsetCorrection() throws Exception {
    final String INPUT = "G&uuml;nther G&uuml;nther is here";

    // create MappingCharFilter
    List<String> mappingRules = new ArrayList<>();
    mappingRules.add( "\"&uuml;\" => \"ü\"" );
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );

    // create PatternTokenizer
    Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.length());

    charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) );
    stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.length());
  }

  /**
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce, that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0)
        out.append(' ');
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    a.close();

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
    b.close();
  }

  // LUCENE-6814
  public void testHeapFreedAfterClose() throws Exception {
    // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?

    // Build a 1MB string:
    StringBuilder b = new StringBuilder();
    for(int i=0;i<1024;i++) {
      // 1023 spaces, then an x
      for(int j=0;j<1023;j++) {
        b.append(' ');
      }
      b.append('x');
    }

    String big = b.toString();

    Pattern x = Pattern.compile("x");

    List<Tokenizer> tokenizers = new ArrayList<>();
    for(int i=0;i<512;i++) {
      Tokenizer stream = new PatternTokenizer(x, -1);
      tokenizers.add(stream);
      stream.setReader(new StringReader(big));
      stream.reset();
      for(int j=0;j<1024;j++) {
        assertTrue(stream.incrementToken());
      }
      assertFalse(stream.incrementToken());
      stream.end();
      stream.close();
    }
  }
}